# NOTE: the former `rm(list = ls())` call was removed. It only deletes the
# visible objects in the global environment (attached packages and options are
# left untouched), so it does not give a clean state, and it destroys the
# workspace of anyone who source()s this file. Run the script in a fresh R
# session instead.

library(stringr)
library(ggplot2)
library(quanteda)
library(quanteda.textplots)
library(dplyr)
library(tidylo)

library(tidyr)
library(tidytext)
library(bigrquery)
library(tidyverse)
library(readxl)
library(openxlsx)

## Load the cleaned US data and prepare the free-text answers (q18)
endeligdata <- read_excel("INSERT WD/cleandata_text_US.xlsx") # read cleaned US data here
endeligdata$q18 <- str_trim(endeligdata$q18) # strip leading/trailing whitespace

# Drop rows without an answer (NA, or empty once trimmed).
endeligdata <- endeligdata[!(is.na(endeligdata$q18) | endeligdata$q18 == ""), ]

# Keep only the answer text and column D.
# NOTE(review): D is compared to "Horizontal ineq" further down, so it is
# presumably the experimental condition label -- confirm against the codebook.
newdata <- endeligdata %>%
  transmute(q18, D)

# Replace every non-alphanumeric character (punctuation, symbols, whitespace)
# with a single space.
newdata$q18 <- str_replace_all(newdata$q18, "[^[:alnum:]]", " ")

# Drop answers that contain no alphanumeric character at all. After the
# replacement above such answers consist of one or more spaces, so the check
# is done on the trimmed string; comparing against exactly " " would only catch
# answers that were a single character long and let e.g. "..." through as an
# empty document.
newdata <- newdata[!(is.na(newdata$q18) | str_trim(newdata$q18) == ""), ]

# Tokenise the cleaned answers into words.
toks <- newdata$q18 %>%
  tokens(
    what = "word",
    remove_numbers = TRUE,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_separators = TRUE,
    # `split_hyphens` is the current name of this option; the old
    # `remove_hyphens` is not a tokens() argument in quanteda 3+ (which this
    # script requires for quanteda.textplots) and was ignored with a warning.
    split_hyphens = TRUE,
    remove_url = TRUE,
    verbose = TRUE
  )

quanteda_options(language_stemmer = "english") # set stemming language to English

toks <- toks %>%
  tokens_tolower() %>% # all words in lower case
  tokens_remove(stopwords("english")) %>% # remove English stop words
  tokens_wordstem() # stem all words

# Build the document-feature matrix (one row per answer, one column per stem).
txt.mat <- dfm(toks)

# Keep only features longer than three characters, i.e. drop every stem of
# one, two or three characters.
# NOTE(review): any dictionary entry of three characters or fewer can therefore
# never be matched against this matrix.
txt.mat <- txt.mat[, str_length(colnames(txt.mat)) > 3]

# Intergroup (race) dictionary for the US data. Entries are matched against the
# stemmed, lower-cased features of `txt.mat`, so they must be written in their
# stemmed form.
# NOTE(review): "jim" cannot match because features of three characters or
# fewer are filtered out of `txt.mat` before the lookup; "crow" still matches.
dict <- dictionary(list(
  race = c(
    "racism", "race", "discrimin", "racial", "histor", "histori", "color",
    "segreg", "privileg", "prejudic", "slaveri", "jim", "crow"
  )
))

# Interpersonal (effort) dictionary for the US data.
# The English Snowball stemmer turns "lazy" into "lazi" and "chance" into
# "chanc", so the unstemmed entries alone never matched any feature. The
# stemmed forms are added; the original spellings are kept for traceability.
dict_work <- dictionary(list(
  effort = c(
    "hard", "effort", "lazy", "lazi", "work", "deserv", "luck",
    "chance", "chanc"
  )
))

# Count, per answer, how many tokens belong to the race dictionary.
text <- dfm_lookup(txt.mat, dictionary = dict)
newdata$race <- as.vector(dfm_match(text, "race")) # number of race-dictionary hits
newdata$race_dum <- as.numeric(newdata$race != 0) # 1 if at least one hit, else 0

# Same for the effort dictionary.
text <- dfm_lookup(txt.mat, dictionary = dict_work)
newdata$effort <- as.vector(dfm_match(text, "effort")) # number of effort-dictionary hits
newdata$effort_dum <- as.numeric(newdata$effort != 0) # 1 if at least one hit, else 0

write.xlsx(newdata, "INSERT WD/dictionary_US.xlsx") # saving excel file to make Figure 1


### Create figure SM18
# Keep the answers from the "Horizontal ineq" rows that contain at least one
# race-dictionary term (rows where either condition is NA are dropped).
visualize_horis <- newdata %>%
  transmute(q18, D, race, race_dum) %>%
  dplyr::filter(D == "Horizontal ineq", race_dum == 1)

# Tokenise the selected answers into words.
toks <- visualize_horis$q18 %>%
  tokens(
    what = "word",
    remove_numbers = TRUE,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_separators = TRUE,
    # `split_hyphens` is the current name of this option; the old
    # `remove_hyphens` is not a tokens() argument in quanteda 3+.
    split_hyphens = TRUE,
    remove_url = TRUE,
    verbose = TRUE
  )

quanteda_options(language_stemmer = "english") # set stemming language to English

toks <- toks %>%
  tokens_tolower() %>% # all words in lower case
  tokens_remove(stopwords("english")) %>% # remove English stop words
  tokens_wordstem() # stem all words

# Document-feature matrix of the selected answers. Unlike the matrix used for
# the dictionary lookup, no minimum feature length is applied here.
txt.mat <- dfm(toks)


# Ten shades of the same blue, with opacity rising from 0.1 to 1.
# vapply() guarantees a character vector regardless of input length.
col <- vapply(
  seq(0.1, 1, 0.1),
  function(x) adjustcolor("#1F78B4", alpha.f = x),
  character(1)
)

# Save figure SM18
pdf(file = "INSERT WD/figure_SM18.pdf",
    width = 3.5,
    height = 3.5,
    compress = TRUE)

# Close the PDF device even if plotting fails, so that no half-written device
# stays open and captures later plots.
invisible(tryCatch(
  textplot_wordcloud(txt.mat, adjust = 0.5, random_order = FALSE,
                     color = col, rotation = FALSE),
  finally = dev.off()
))








